##################################################### # Script to create network graphs of genes shared between GO Slim terms. # Requires a matrix or data frame (read in from a file) # containing Swiss-Pro IDs (Gene_ID), GO domain (GO.domain2), and GO Slim terms (GO.slim). # (If column names are different, code will have to be changed appropriately.) # Creates data frames with the number of genes per GO Slim terms and # the number of genes shared between GO-slim terms and # uses these matrices to create network graphs using iGraph. # # Author: Sonia Singhal, 23 Nov 2012 # singhal@u.washington.edu ##################################################### library(igraph) # Function to get a vector of unique GO Slim terms from a data table. getGOslimNames <- function(GOslim_data){ return (unique(GOslim_data$GO.slim)) } # For each GO Slim term, count the number of genes involved getGenesPerSlim <- function(GOslim_data){ GOslim <- getGOslimNames(GOslim_data) genes_per_slim <- data.frame(GOslim = GOslim, num_genes = rep(0, length(GOslim))) for (s in GOslim){ sub <- subset(GOslim_data, GO.slim == s, select = c(Gene_ID, GO.slim)) genes_per_slim$num_genes[genes_per_slim$GOslim == s] <- length(unique(sub$Gene_ID)) } return (genes_per_slim) } # Create an adjacency matrix from shared genes between GO-slim terms. makeAdjancencyMat <- function(GOslim_data){ # Initialize the network adjacency matrix. GOslim <- getGOslimNames(GOslim_data) GOslim_mat <- matrix(nrow = length(GOslim), ncol = length(GOslim), data = 0) dimnames(GOslim_mat)[[1]] <- GOslim dimnames(GOslim_mat)[[2]] <- GOslim # For each gene, count the number of GO-slim terms # Insert the values in the matrix genes <- unique(GOslim_data$Gene_ID) for(g in genes){ sub <-subset(GOslim_data, Gene_ID == g, select = c(Gene_ID, GO.slim)) slim_terms <- unique(sub$GO.slim) if(length(slim_terms) > 1){ for(term1 in slim_terms){ for(term2 in slim_terms){ if (term1 == term2){ next } GOslim_mat[which(GOslim == term1), which(GOslim == term2)] <- GOslim_mat[which(GOslim == term1), which(GOslim == term2)] + 1 } } } } return (GOslim_mat) } # Initialize the graph makeIGraph <- function(GOslim_mat, genes_per_slim){ g <- graph.adjacency(GOslim_mat, mode = "undirected", weighted = TRUE) g <- set.vertex.attribute(g, "name", value = as.character(genes_per_slim$GOslim)) g <- set.vertex.attribute(g, "number", value = genes_per_slim$num_genes) return (g) } # Read in data. Filename will need to be changed for a different file/ computer. filename <- "C:/Users/Sonia/Documents/FHL/ID for DEG in QPX (trimmed).txt" diff_gene_data <- read.delim(filename, stringsAsFactors = FALSE) # extract genes involved in biological processes proc_data <- subset(diff_gene_data, GO.domain2 == "P") # Remove "other" categories in the GO-slim terms proc_data <- subset(proc_data, GO.slim != "other biological processes") proc_data <- subset(proc_data, GO.slim != "other metabolic processes") # Find the number of genes per GO-slim term proc_genes_per_slim <- getGenesPerSlim(proc_data) # Create the adjacency matrix of connections GOslim_proc_mat <- makeAdjancencyMat(proc_data) # Initialize the graph proc_graph <- makeIGraph(GOslim_proc_mat, proc_genes_per_slim) # add carriage returns into the two longest names V(proc_graph)$name[2] <- "cell cycle and\nproliferation" V(proc_graph)$name[3] <- "cell organization\nand biogenesis" # The figures that appear in the paper were created first # by plotting with the tkplot function in iGraph, which allows for # interactive arrangement of the nodes in the graph. # The position of each node was obtained by querying the plot for the coordinates # and tweaked in the basic plotting function (igraph.plot). # Sample functions: tkplot(proc_graph, vertex.label = V(proc_graph)$name, vertex.size = log(V(proc_graph)$number)*10, edge.width = E(proc_graph)$weight, edge.label = E(proc_graph)$weight, edge.color = "black") proc_coords <- tkplot.getcoords(2) # 2 is the number of the particular tkplot. # Visualize graph in a TIFF file dev.set(dev.next()) tiff("proc_network.tiff") plot.igraph(proc_graph, vertex.label = V(proc_graph)$name, vertex.size = log(V(proc_graph)$number)*10, # Scale node size by the number of genes vertex.label.degree = c(0, 0, 0, pi/2, pi, pi, pi, pi, -pi/2, 0, 0), # Arrange the labels around the circumference of the graph. Starts at "Transport" and moves clockwise vertex.label.dist = c(2, 1.3, 1.3, 0.5, 2.3, 1.7, 2, 2, 0.5, 1.1, 1.1), # Move the labels away from the center of each node vertex.label.color = "black", edge.width = E(proc_graph)$weight, # Change width of connections based on the number of shared genes layout = proc_coords2, # Use the coordinates from the tkplot to arrange nodes edge.color = "black") dev.off()